# waisqp.py -- from the Python 1.3.3 distribution (Demo/www), 1996-03-12
# Parser for WAIS question files.
# The syntax is (I am really making this up, there is no decent grammar):
# file: node
# node: record | list
# record: '(' keyword (keyword value)* ')'
# list: '(' record* ')'
# value: string | keyword | othertoken | node | '#' bytelist
# bytelist: '(' number* ')'
# Tokens are really almost anything, only strings are treated specially;
# keywords are tokens starting with ':'.
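#
# An illustrative example that fits the grammar above (the keywords and
# values are made up; real question files define their own):
#
#	(:question
#		:version  2
#		:seed-words  "word processing"
#		:sources  ( (:source-id  :filename  "nytimes.src") )
#		:raw-data  #( 0 1 2 255 )
#	)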
import regex
import string
# Class representing a record.
# This is accessed as if it is a dictionary.
# Limited sequential access is also supported: "for k, v in r: ..."
#
class Record:
	#
	def __init__(self, type):
		self.type = type
		self.itemlist = []
	#
	def __repr__(self):
		s = '(:' + self.type + '\n'
		for k, v in self.itemlist:
			v = str(v)
			if '\n' in v:
				lines = string.splitfields(v, '\n')
				v = string.joinfields(lines, '\n ')
			s = s + ' :' + k + ' ' + v + '\n'
		s = s + ')'
		return s
	#
	def __setitem__(self, keyword, value):
		for i in range(len(self.itemlist)):
			if keyword == self.itemlist[i][0]:
				self.itemlist[i] = (keyword, value)
				return
		self.itemlist.append((keyword, value))
	#
	def __delitem__(self, keyword):
		for i in range(len(self.itemlist)):
			if keyword == self.itemlist[i][0]:
				del self.itemlist[i]
				return
		raise KeyError, 'keyword not in Record: ' + repr(keyword)
	#
	def __getitem__(self, keyword):
		if type(keyword) == type(0):
			# Sequence type access
			return self.itemlist[keyword]
		# Mapping type access
		for k, v in self.itemlist:
			if k == keyword: return v
		raise KeyError, 'keyword not in Record: ' + repr(keyword)
	#
	def __len__(self):
		return len(self.itemlist)
	#
	def keys(self):
		keys = []
		for k, v in self.itemlist:
			keys.append(k)
		return keys
	#
	def has_key(self, keyword):
		for k, v in self.itemlist:
			if k == keyword: return 1
		return 0
	#
	def gettype(self):
		return self.type
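#
# A small usage sketch (illustrative; the keyword names are made up):
#
#	r = Record('question')
#	r['seed-words'] = '"word processing"'
#	print r			# prints the record in the syntax above
#	for k, v in r: print k, v
#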
# Class representing a list of values.
#
class List:
	#
	def __init__(self, *args):
		self.list = []
		for item in args:
			self.list.append(item)
	#
	def __repr__(self):
		s = '(\n'
		for item in self.list:
			item = str(item)
			if '\n' in item:
				lines = string.splitfields(item, '\n')
				item = string.joinfields(lines, '\n ')
			s = s + ' ' + item + '\n'
		s = s + ')'
		return s
	#
	def append(self, item):
		self.list.append(item)
	#
	def insert(self, i, item):
		self.list.insert(i, item)
	#
	def remove(self, item):
		self.list.remove(item)
	#
	def __len__(self):
		return len(self.list)
	#
	def __getitem__(self, i):
		return self.list[i]
	#
	def __setitem__(self, i, value):
		self.list[i] = value
	#
	def __delitem__(self, i):
		del self.list[i]
	#
	def __getslice__(self, i, j):
		new = List()
		for item in self.list[i:j]:
			new.append(item)
		return new
# Class representing a list of bytes.
#
class BytesList:
	#
	def __init__(self):
		self.bytes = ''
	#
	def __repr__(self):
		s = '#('
		for byte in self.bytes:
			s = s + ' ' + str(ord(byte))
		s = s + ' )'
		return s
	#
	def append(self, value):
		try:
			i = string.atoi(value)
		except string.atoi_error:
			raise SyntaxError, (value, 'byte')
		try:
			c = chr(i)
		except ValueError:
			raise SyntaxError, (value, 'byte in 0..255')
		self.bytes = self.bytes + c
# Regular expressions used by the tokenizer, and "compiled" versions
#
wspat = '\([ \t\n\r\f]+\|;.*\n\)*'
tokenpat = '[()#"]\|[^()#"; \t\n\r\f]+'
stringpat = '"\(\\\\.\|[^\\"]\)*"' # "\(\\.\|[^\"]\)*"
wsprog = regex.compile(wspat)
tokenprog = regex.compile(tokenpat)
stringprog = regex.compile(stringpat)
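#
# For example (illustrative), the tokenizer turns the line
#	(:seed-words "word processing")	; trailing comment
# into the tokens '(', ':seed-words', '"word processing"' and ')';
# the ';' comment is skipped by wsprog along with the whitespace.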
# Parser base class without look-ahead.
# Instantiate each time you want to parse a file.
#
class RealBaseParser:
	#
	def __init__(self, input):
		#
		# 'input' should have a parameterless method readline()
		# which returns the next line, including trailing '\n',
		# or the empty string if there is no more data.
		# An open file will do nicely, as does an instance
		# of StringInput below.
		#
		self.input = input
		self.lineno = 0
		#
		# Reset the scanner interface.
		#
		self.reset()
	#
	def reset(self):
		self.nextline = ''
		self.pos = 0
		self.tokstart = 0
		self.eofseen = 0
	#
	# The real work of getting a token is done here.
	# This is the first place to look if you think
	# the parser is too slow.
	#
	def getnexttoken(self):
		while 1:
			k = wsprog.match(self.nextline, self.pos)
			if k < 0:
				raise SyntaxError, ('', 'whitespace')
			self.pos = self.pos + k
			k = tokenprog.match(self.nextline, self.pos)
			if k >= 0:
				break
			#
			# End of line hit
			#
			if self.eofseen:
				self.nextline = ''
			else:
				self.nextline = self.input.readline()
			self.pos = self.tokstart = 0
			if not self.nextline:
				if self.eofseen:
					raise EOFError
				self.eofseen = 1
				return ''
			self.lineno = self.lineno + 1
		#
		# Found a token
		#
		self.tokstart, self.pos = self.pos, self.pos + k
		token = self.nextline[self.tokstart:self.pos]
		if token == '"':
			#
			# Get the whole string -- may read more lines
			#
			k = stringprog.match(self.nextline, self.tokstart)
			while k < 0:
				cont = self.input.readline()
				if not cont:
					k = len(self.nextline) - self.tokstart
					break
				self.nextline = self.nextline + cont
				self.lineno = self.lineno + 1
				k = stringprog.match(self.nextline, \
					self.tokstart)
			self.pos = self.tokstart + k
			token = self.nextline[self.tokstart:self.pos]
		return token
	#
	# Default error handlers.
	#
	def reporterror(self, filename, message, fp):
		fp.write(filename)
		fp.write(':' + `self.lineno` + ': ')
		fp.write(message)
		fp.write('\n')
		self.printerrorline(fp)
	#
	def printerrorline(self, fp):
		line = self.nextline
		fp.write(line)
		if line[-1:] <> '\n':
			fp.write('\n')
		for i in range(len(line)):
			if i >= self.tokstart:
				n = max(1, self.pos - i)
				fp.write('^'*n)
				break
			elif line[i] == '\t':
				fp.write('\t')
			elif ' ' <= line[i] < '\177':
				fp.write(' ')
		fp.write('\n')
# Parser base class. Instantiate each time you want to parse a file.
# This supports a single token look-ahead.
#
class BaseParser(RealBaseParser):
	#
	def reset(self):
		RealBaseParser.reset(self)
		self.pushback = ''
	#
	def peektoken(self):
		if not self.pushback:
			self.pushback = self.getnexttoken()
		return self.pushback
	#
	def gettoken(self):
		if self.pushback:
			token = self.pushback
			self.pushback = ''
		else:
			token = self.getnexttoken()
		if token == '':
			raise EOFError
		return token
	#
	def ungettoken(self, token):
		if self.pushback:
			raise RuntimeError, 'more than one ungettoken'
		# print 'pushback:', token
		self.pushback = token
# Parser for a node. Instantiate, and call getnode() to parse a node.
#
class Parser(BaseParser):
	#
	# Parse a node. This is highly recursive.
	#
	def getnode(self):
		self.open()
		# This can be either a list or a record
		if self.peektoken() in ('(', ')'): # It's a list
			list = List()
			while self.more():
				list.append(self.getnode())
			self.close()
			return list
		# Not a list, must be a record
		type = self.getkeyword()
		rec = Record(type)
		while self.more():
			keyword = self.getkeyword()
			value = self.getvalue()
			rec[keyword] = value
		self.close()
		return rec
	#
	def getkeyword(self):
		t = self.gettoken()
		if t[0] <> ':' or t == ':':
			raise SyntaxError, (t, ':<keyword>')
		return t[1:]
	#
	def getvalue(self):
		t = self.peektoken()
		if t == '(':
			return self.getnode()
		if t == '#':
			self.expect('#')
			return self.getbyteslist()
		if t == ')':
			raise SyntaxError, (t, '<value>')
		return self.gettoken()
	#
	def getbyteslist(self):
		bytes = BytesList()
		self.open()
		while self.more():
			bytes.append(self.getbyte())
		self.close()
		return bytes
	#
	def getbyte(self):
		return self.gettoken()
	#
	# Shorthands for frequently occurring parsing operations
	#
	def open(self):
		self.expect('(')
	#
	def close(self):
		self.expect(')')
	#
	def expect(self, exp):
		t = self.gettoken()
		if t <> exp:
			raise SyntaxError, (t, exp)
	#
	def more(self):
		if self.peektoken() == ')':
			return 0
		else:
			return 1
# A class to parse from a string
#
class StringInput:
	#
	def __init__(self, string):
		self.string = string
		self.pos = 0
	#
	def __repr__(self):
		return '<StringInput instance, string=' + `self.string` \
			+ ', pos=' + `self.pos` + '>'
	#
	def readline(self):
		string = self.string
		i = self.pos
		n = len(string)
		while i < n:
			if string[i] == '\n':
				i = i+1
				break
			i = i+1
		string = string[self.pos : i]
		self.pos = i
		return string
# Convenience routines to parse a file
#
def parsefile(filename):
	f = open(filename, 'r')
	p = Parser(f)
	result = p.getnode()
	f.close()
	return result
#
def parse(f):
	p = Parser(f)
	return p.getnode()
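#
# Example use (illustrative; the question text is made up, and note that
# the value keeps its surrounding quotes exactly as tokenized):
#
def example():
	q = parse(StringInput('(:question :seed-words "word processing")'))
	print q.gettype()	# prints: question
	print q['seed-words']	# prints: "word processing"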
# Test driver for tokenizer -- reads from stdin
#
def testtokenizer():
	import sys
	p = Parser(sys.stdin)
	try:
		while 1: p.gettoken()
	except EOFError:
		print 'EOF'
	except SyntaxError, msg:
		p.reporterror('<stdin>', 'Syntax error: ' + str(msg), sys.stderr)
# Test driver for parser -- reads from stdin
#
def testparser():
	import sys
	p = Parser(sys.stdin)
	try:
		x = p.getnode()
	except EOFError:
		print 'unexpected EOF at line', p.lineno
		return
	except SyntaxError, msg:
		if type(msg) == type(()):
			gotten, expected = msg
			msg = 'got ' + `gotten` + ', expected ' + `expected`
		p.reporterror('<stdin>', 'Syntax error: ' + msg, sys.stderr)
		return
	print x